library(tidyverse)
library(urltools)
library(lubridate)
library(stargazer)
library(lmtest)
library(sandwich)

# from archived
# Input locations. `data_dir` keeps its trailing slash because file names are
# appended to it with paste0().
data_dir <- "../data/"
input_file <- paste0(data_dir, "news_domain_urls.csv")

# Read the URL-level records, discard the `used_at` column, drop every row
# that still contains a missing value, and fold the "aggregator" site type
# into "portal" so both are treated as a single type from here on.
urls <- read_csv(input_file) %>%
  select(-used_at) %>%
  na.omit() %>%
  mutate(type = ifelse(type == "aggregator", "portal", type))

# Print the number of distinct `caseid` values left after cleaning.
n_distinct(urls$caseid)

# Split every URL into its components, then break the parsed host into its
# subdomain / domain / suffix parts.
parsed <- tibble(url_parse(urls$url))
suffix <- tibble(suffix_extract(parsed$domain))

# Replace the `domain` and `path` columns that came with the input file by the
# freshly parsed versions. Of the suffix-extraction output only `domain` is
# kept. Missing paths become empty strings, and rows for which no domain could
# be extracted are dropped.
urls <- urls %>%
  select(-domain, -path) %>%
  bind_cols(select(parsed, path), suffix) %>%
  select(-suffix, -subdomain, -host) %>%
  mutate(path = coalesce(path, "")) %>%
  filter(!is.na(domain))


# Build a single alternation regex ("kw1|kw2|...") from the first column of
# the header-less keyword file.
political_keywords <- read_csv(
  paste0(data_dir, "PoliticalKeyWords.csv"),
  col_names = FALSE
) %>%
  pull(X1) %>%
  str_c(collapse = "|")

# Indicator columns, stored as 0/1 doubles:
#   political - the URL path matches at least one keyword; an NA match result
#               (which can only come from an NA path) is counted as 0
#   portal    - the site type is "portal"
urls <- urls %>%
  mutate(
    political = as.numeric(str_detect(path, political_keywords)),
    political = coalesce(political, 0),
    portal = as.numeric(type == "portal")
  )


### START Time-based analysis

# Values of `type` that are grouped together as legacy media.
legacy_list <- c(
  "cable television",
  "foreign newspaper",
  "foreign television",
  "local newspaper",
  "local radio",
  "local television",
  "magazine",
  "national newspaper",
  "national television",
  "radio",
  "student newspaper",
  "wire service"
)
# Values of `type` that are grouped together as online-only outlets.
online_list <- c(
  "foreign online",
  "online",
  "online newspaper"
)
# Values of `domain` (not `type`) that are treated as polling sites.
polling_list <- c(
  "fivethirtyeight",
  "realclearpolitics",
  "rasmussenreports"
)

# Assign each row a coarse domain category. The first matching branch wins, so
# a legacy `type` takes precedence over a polling `domain`, which in turn
# takes precedence over an online `type`. Anything unmatched becomes "OTHER".
urls <- mutate(
  urls,
  domain_cat = case_when(
    type %in% legacy_list ~ "legacy",
    domain %in% polling_list ~ "polling",
    type %in% online_list ~ "online",
    type == "portal" ~ "portal",
    TRUE ~ "OTHER"
  )
)

# For portal rows only: per `caseid`, the share of total `duration` spent on
# news pages and on political pages, then the mean of those per-caseid shares.
# A portal page counts as news when the URL contains "/news/" (yahoo, msn,
# aol) or, for yahoo only, matches "news.yahoo".
portals <- urls %>%
  filter(domain_cat == "portal") %>%
  mutate(
    news = case_when(
      domain == "yahoo" & str_detect(url, "/news/|news.yahoo") ~ 1,
      domain %in% c("msn", "aol") & str_detect(url, "/news/") ~ 1,
      TRUE ~ 0
    )
  ) %>%
  group_by(caseid) %>%
  summarize(
    all = sum(duration),
    news = sum(duration[news == 1]),
    political = sum(duration[political == 1])
  ) %>%
  # Cases with zero total duration have no defined share.
  filter(all > 0) %>%
  transmute(news = news / all, political = political / all) %>%
  summarize(news = mean(news), political = mean(political))

# # Approximately 30 percent of traffic to non-portal (i.e., not Yahoo!, MSN, or AOL) news domains in our sample is political news. In contrast, just 6 percent of traffic to portals is coded as political news.
# urls %>% 
#   group_by(caseid, portal) %>% 
#   summarize(
#     all = n(),
#     political = sum(political)
#   ) %>% 
#   ungroup() %>% 
#   filter(all > 0) %>% 
#   mutate(political = political / all) %>% 
#   lm(political ~ portal, data = .)


# Visit-based model. For every caseid x domain category cell, compute the
# fraction of URL rows flagged political, then regress that fraction on the
# category dummies without an intercept, so each coefficient is the mean
# fraction within a category.
visits <- urls %>%
  group_by(caseid, domain_cat) %>%
  summarize(political = sum(political) / n()) %>%
  ungroup() %>%
  lm(political ~ domain_cat - 1, data = .)

# Time-based model. Same design, but the outcome is the fraction of `duration`
# spent on political URLs. Cells with zero total duration are dropped because
# the fraction is undefined there.
time <- urls %>%
  group_by(caseid, domain_cat) %>%
  summarize(
    all = sum(duration),
    political = sum(duration[political == 1])
  ) %>%
  ungroup() %>%
  filter(all > 0) %>%
  mutate(political = political / all) %>%
  lm(political ~ domain_cat - 1, data = .)

# Coefficient tables with heteroskedasticity-robust (HC1) standard errors.
tab_visits <- coeftest(visits, vcov. = vcovHC(visits, type = "HC1"))
tab_time <- coeftest(time, vcov. = vcovHC(time, type = "HC1"))

# Write the LaTeX table, substituting the robust standard errors for the
# default ones.
# NOTE(review): `covariate.labels` lists four categories; if any rows fall
# into the "OTHER" category the model has a fifth coefficient and these labels
# would no longer line up -- confirm that no "OTHER" rows exist.
stargazer(
  time, visits,
  type = "latex",
  style = "ajps",
  out = "../results/tables/traffic_by_domain_cat.tex",
  title = "Average fraction of time and URL visits dedicated to political news visits by domain category (among people who visit sites within that category). By either measure, less than ten percent of portal traffic is dedicated to reading political news, while political news makes up a much larger share of traffic on legacy, online, and polling news sources.",
  label = "t:traffic_by_domain_cat",
  dep.var.labels = "Political News Share of ",
  column.labels = c("Time", "URL Visits"),
  covariate.labels = c("Legacy", "Online", "Polling", "Portal"),
  se = list(tab_time[, "Std. Error"], tab_visits[, "Std. Error"]),
  digits = 2,
  model.numbers = FALSE,
  omit.table.layout = "sn",
  star.char = ""
)


### END Time-based analysis









### Political share of portal visits for heavy portal users

# Heavy portal users (HPUs): caseids for which at least half of the URL rows
# flagged political are on portal sites.
HPUs <- urls %>%
  filter(political == 1) %>%
  group_by(caseid) %>%
  summarise(portal_share = mean(portal)) %>%
  filter(portal_share >= 0.5) %>%
  pull(caseid)

# Restricting to the HPUs' portal rows, compute each caseid's fraction of rows
# flagged political and print the deciles of that distribution.
urls %>%
  filter(portal == 1, caseid %in% HPUs) %>%
  group_by(caseid) %>%
  summarise(mean_political = mean(political)) %>%
  pull(mean_political) %>%
  quantile(probs = seq(0, 1, by = 0.1)) %>%
  round(digits = 2)

###


############# Political Recall

# Paths of the URLs flagged political, one entry per distinct (url, path) pair.
pol <- urls %>%
  filter(political == 1) %>%
  select(url, path, political) %>%
  distinct() %>%
  pull(path)

# Recover the individual keyword patterns from the combined "a|b|c" regex.
pol_keys <- str_split(political_keywords, fixed("|"))[[1]]
# these words are captured by stems already in the list
pol_keys <- pol_keys[!pol_keys %in% c(
  "electoral", "presidential", "election",
  "congressman", "congresswoman", "congressperson",
  "politics", "political"
)]

# contains[i, j] is 1 when path i matches keyword j, 0 otherwise.
# str_detect() is vectorised over the paths, so one call per keyword replaces
# the former path-by-keyword double loop. seq_along() keeps the loop safe when
# there are no keywords, and the column-wise assignment also works when there
# are no paths (the old `1:length(x)` loops indexed out of bounds on empty
# input).
contains <- matrix(0, nrow = length(pol), ncol = length(pol_keys))
for (j in seq_along(pol_keys)) {
  contains[, j] <- 1 * str_detect(pol, pol_keys[j])
}

# 1 for paths that match exactly one keyword from the list.
contains_one <- 1 * (rowSums(contains) == 1)

# For each keyword: percentage of paths for which it is the *only* matching
# keyword. `contains_one` has one entry per row, so multiplying recycles it
# down each column of `contains`.
only_one_rate <- 100 * colMeans(contains * contains_one)

df <- tibble(pol_keys, only_one_rate) %>%
  arrange(-only_one_rate)

# Write the sensitivity table as a LaTeX longtable.
library(xtable)
df %>%
  rename(Keyword = pol_keys, `Solo Keyword Rate (%)` = only_one_rate) %>%
  xtable(
    caption = "Political keyword list sensitivity. Solo Keyword Rate refers to how often that keyword is the only keyword from our list in the URL's path, and should be seen as a measure of how sensitive the list is to the inclusion of that particular keyword.") %>%
  print(
    tabular.environment = "longtable",
    floating = FALSE,
    include.rownames = FALSE,
    table.placement = "H",
    file = "../results/tables/keyword_list_sensitivity.tex")

############# DOMAIN CATEGORIES

# `domain_cat` (legacy / polling / online / portal / OTHER) and the
# `legacy_list`, `online_list` and `polling_list` vectors it is derived from
# are defined in the time-based analysis section earlier in this script.
# Neither `urls$type`, `urls$domain` nor the lists change between that section
# and this point, so re-deriving the column here would only duplicate that
# logic and create two definitions to keep in sync. This section therefore
# just verifies that the column is present before it is used below.
stopifnot("domain_cat" %in% names(urls))

# Share of rows falling into each domain category, rounded to two decimals:
# `a` is computed over all rows of `urls`, `b` over the rows flagged political.
a <- urls %>% pull(domain_cat) %>% table() %>% prop.table() %>%
  round(2)
b <- urls %>% filter(political == 1) %>% 
  pull(domain_cat) %>% table() %>% prop.table() %>%
  round(2)

# Stack the two share tables as rows, then transpose so that each category
# becomes a row with its name in `domain_cat`. Because cbind() combines the
# shares with a character vector, the share columns end up as character.
# NOTE(review): this relies on bind_rows() accepting named `table` objects as
# rows and on as_tibble() naming the two unnamed matrix columns V2 and V3 --
# both depend on the installed dplyr/tibble versions; confirm before upgrading.
temp <- bind_rows(a, b)
temp <- as_tibble(cbind(domain_cat = names(temp), t(temp)))
temp <- temp %>% rename(all = V2, political = V3)

# Print a preview with presentation column names; the renamed result is not
# stored, so `temp` keeps its short column names.
temp %>% 
  head(n = 8) %>% 
  rename(`All News Domain URLs` = all,
    `Just Political News URLs` = political, `Category` = domain_cat) 

# Save the breakdown both as CSV and as a LaTeX table (written as-is, without
# summary statistics).
temp %>% write_csv("../results/traffic_breakdown_domain_cat.csv")
temp %>% 
  as.data.frame() %>% 
  stargazer(summary = FALSE,
    rownames = FALSE,
    style = "ajps", type = "latex",
    out = "../results/tables/category_breakdown.tex",
    table.placement = "H",
    title = "Media diet by domain category for all news domain traffic (left) and only political news (right).")

# Two-stage average: first the fraction of political rows within each
# caseid x domain category cell, then the mean of those fractions within each
# domain category, rounded to two decimals.
tab <- urls %>%
  group_by(caseid, domain_cat) %>%
  summarise(mean_pol = mean(political)) %>%
  group_by(domain_cat) %>%
  summarise(mean_pol = round(mean(mean_pol), 2))

# Write the table to LaTeX with presentation column names.
tab %>%
  select(
    `Domain Category` = domain_cat,
    `Fraction Political` = mean_pol
  ) %>%
  as.data.frame() %>%
  stargazer(
    type = "latex",
    style = "ajps",
    summary = FALSE,
    rownames = FALSE,
    digits = 2,
    table.placement = "H",
    out = "../results/tables/category_fraction_political.tex",
    title = "Fraction of URLs which are coded as political news URLs by domain category."
  )


#############

# Printed summaries (two decimals): the share of rows on portals, then the
# share of rows flagged political among portal and among non-portal rows.
round(mean(urls$portal), 2)
round(mean(filter(urls, portal == 1)$political), 2)
round(mean(filter(urls, portal == 0)$political), 2)

# From here on `urls` holds only the rows flagged political; the now-constant
# indicator column is dropped.
urls <- urls %>%
  filter(political == 1) %>%
  select(-political)

# Print the number of distinct `caseid` values with at least one political row.
n_distinct(urls$caseid)

# De-duplicate on (caseid, domain, date, path). Sorting by descending duration
# first means the row kept for each combination is the one with the longest
# duration. The result is then ordered by caseid and date.
urls <- urls %>%
  arrange(desc(duration)) %>%
  distinct(caseid, domain, date, path, .keep_all = TRUE) %>%
  arrange(caseid, date)

# Rows per caseid, scaled by 99 and sorted from largest to smallest.
# NOTE(review): the divisor 99 is not explained anywhere in this script, and
# `counts` is not referenced again below -- confirm what it is meant to
# normalise by and whether it is still needed.
counts <- urls %>%
  group_by(caseid) %>%
  summarise(count = n() / 99) %>%
  arrange(desc(count))


# Save the de-duplicated political URL rows.
urls %>% write_csv("../results/political_urls.csv")